Load tidyverse and check that you are in the right working directory. You can enter getwd() in the console window and see if the path looks right.

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

Distance

US Cities

In the lecture we looked at a map of the US and a matrix of distances between 9 major cities. Let's read in the matrix and have a quick look. For tidiness, I have saved it inside a folder called 'data'.

# Read the 9x9 matrix of distances (miles) between major US cities.
cities <- read_csv('data/city_distance.csv')
## Rows: 9 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): city
## dbl (9): BOS, NY, DC, MIA, CHI, SEA, SF, LA, DEN
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Quick structure check: 9 rows, one label column plus 9 distance columns.
glimpse(cities)
## Rows: 9
## Columns: 10
## $ city <chr> "BOS", "NY", "DC", "MIA", "CHI", "SEA", "SF", "LA", "DEN"
## $ BOS  <dbl> 0, 206, 429, 1504, 963, 2976, 3095, 2979, 1949
## $ NY   <dbl> 206, 0, 233, 1308, 802, 2815, 2934, 2786, 1771
## $ DC   <dbl> 429, 233, 0, 1075, 671, 2684, 2799, 2631, 1616
## $ MIA  <dbl> 1504, 1308, 1075, 0, 1329, 3273, 3053, 2687, 2037
## $ CHI  <dbl> 963, 802, 671, 1329, 0, 2013, 2142, 2054, 996
## $ SEA  <dbl> 2976, 2815, 2684, 3273, 2013, 0, 808, 1131, 1307
## $ SF   <dbl> 3095, 2934, 2799, 3053, 2142, 808, 0, 379, 1235
## $ LA   <dbl> 2979, 2786, 2631, 2687, 2054, 1131, 379, 0, 1059
## $ DEN  <dbl> 1949, 1771, 1616, 2037, 996, 1307, 1235, 1059, 0
# view() opens the data viewer and (invisibly) returns its input, so
# printing the saved result also shows the tibble in the console.
viewcities <- cities %>% view()
print(viewcities)
## # A tibble: 9 × 10
##   city    BOS    NY    DC   MIA   CHI   SEA    SF    LA   DEN
##   <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 BOS       0   206   429  1504   963  2976  3095  2979  1949
## 2 NY      206     0   233  1308   802  2815  2934  2786  1771
## 3 DC      429   233     0  1075   671  2684  2799  2631  1616
## 4 MIA    1504  1308  1075     0  1329  3273  3053  2687  2037
## 5 CHI     963   802   671  1329     0  2013  2142  2054   996
## 6 SEA    2976  2815  2684  3273  2013     0   808  1131  1307
## 7 SF     3095  2934  2799  3053  2142   808     0   379  1235
## 8 LA     2979  2786  2631  2687  2054  1131   379     0  1059
## 9 DEN    1949  1771  1616  2037   996  1307  1235  1059     0

We will make a new dataframe called md_cities. We remove the column called city to leave only numeric data. We use cmdscale to create a matrix reduced to 2 dimensions. Convert the matrix back to a dataframe and add the column city back for plotting.

# Classical MDS: drop the label column, reduce the distance matrix to
# two dimensions, then reattach the city names for plotting.
md_cities <- cities %>% 
  select(-city) %>% 
  cmdscale() %>% 
  as.data.frame() %>% 
  mutate(city_name = cities$city)
glimpse(md_cities)
## Rows: 9
## Columns: 3
## $ V1        <dbl> -1348.6683, -1198.8741, -1076.9855, -1226.9390, -428.4548, 1…
## $ V2        <dbl> -462.40060, -306.54690, -136.43204, 1013.62838, -174.60316, …
## $ city_name <chr> "BOS", "NY", "DC", "MIA", "CHI", "SEA", "SF", "LA", "DEN"
# Open the MDS result in the viewer and print it to the console.
viewmdcities <- view(md_cities)
viewmdcities
##           V1         V2 city_name
## 1 -1348.6683 -462.40060       BOS
## 2 -1198.8741 -306.54690        NY
## 3 -1076.9855 -136.43204        DC
## 4 -1226.9390 1013.62838       MIA
## 5  -428.4548 -174.60316       CHI
## 6  1596.1594 -639.30777       SEA
## 7  1697.2283  131.68586        SF
## 8  1464.0470  560.58046        LA
## 9   522.4871   13.39576       DEN

Now we can plot x and y to recreate the map. Oops — it's upside down.

# Plot the two MDS dimensions, labelling each point with its city name.
ggplot(md_cities, aes(x = V1, y = V2, label = city_name)) +
  geom_text()

Turn it back the right way by reversing both axes.

# Reverse both axes so the layout matches the usual map orientation.
ggplot(md_cities, aes(x = V1, y = V2, label = city_name)) +
  geom_text() +
  scale_x_reverse() +
  scale_y_reverse()

NSS data for 2024

National Student Survey Results

OPTIONAL: You can open up the R script called “2_week_04_NSS24_preprocess.R” If you run the whole script, it will read in a csv of NSS data (from inside the ‘data’ folder), pre-process it, and save the clean version as a csv file (back into the data folder), and also as an R object.

I have already made the R object for you which you can just load directly.

# Load the pre-processed NSS results (creates NSS24 in the workspace).
# NOTE(review): the text above says the data live in the 'data' folder,
# but this .rda is loaded from the project root — confirm the path.
load('./nss_agree.rda')
glimpse(NSS24)
## Rows: 411
## Columns: 10
## $ ukprn    <chr> "10042570", "10067853", "10000163", "10007849", "10007856", "…
## $ provider <chr> "AAP Education Limited", "ACM Guildford Limited", "AECC Unive…
## $ Scale01  <dbl> 0.785, 0.765, 0.788, 0.872, 0.886, 0.955, 0.683, 0.958, 0.827…
## $ Scale02  <dbl> 0.736, 0.740, 0.686, 0.838, 0.867, 1.000, 0.635, 0.888, 0.806…
## $ Scale03  <dbl> 0.768, 0.773, 0.476, 0.835, 0.848, 0.960, 0.740, 0.838, 0.787…
## $ Scale04  <dbl> 0.780, 0.782, 0.836, 0.905, 0.913, 1.000, 0.800, 0.938, 0.810…
## $ Scale05  <dbl> 0.493, 0.538, 0.438, 0.799, 0.853, 0.800, 0.425, 0.750, 0.675…
## $ Scale06  <dbl> 0.485, 0.712, 0.824, 0.907, 0.916, 0.758, 0.628, 0.747, 0.854…
## $ Scale07  <dbl> 0.594, 0.626, 0.540, 0.780, 0.803, 0.933, 0.533, 0.778, 0.723…
## $ exeter   <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE…
# Work with a more descriptive name from here on.
nss_agree <- NSS24

Distance between the University of Exeter and the London School of Economics.

# Pull out the single row for the University of Exeter.
nss_exeter <- filter(nss_agree, provider == 'University of Exeter')
print(nss_exeter)
## # A tibble: 1 × 10
##   ukprn  provider Scale01 Scale02 Scale03 Scale04 Scale05 Scale06 Scale07 exeter
##   <chr>  <chr>      <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl> <lgl> 
## 1 10007… Univers…   0.869   0.802    0.71   0.852   0.812   0.867   0.722 TRUE
# And the single row for the LSE.
nss_lse <- filter(nss_agree,
                  provider == 'The London School of Economics and Political Science')
print(nss_lse)
## # A tibble: 1 × 10
##   ukprn  provider Scale01 Scale02 Scale03 Scale04 Scale05 Scale06 Scale07 exeter
##   <chr>  <chr>      <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl> <lgl> 
## 1 10004… The Lon…   0.875   0.801   0.732   0.877    0.82   0.886   0.741 FALSE

Euclidean Distance:

Take the difference of each item, square each difference and add it all up. Take the square root.

# Euclidean distance between Exeter and the LSE across the seven NSS
# scales: per-scale differences, squared, summed, then square-rooted.
# Computing over the scale columns replaces seven copy-pasted terms
# and generalises automatically if more scales are added.
scale_cols <- paste0('Scale0', 1:7)
sqrt(sum((nss_exeter[scale_cols] - nss_lse[scale_cols])^2))
## [1] 0.04395452

Distance between Universities of Exeter and Bristol

# Single row for the University of Bristol.
nss_bristol <- filter(nss_agree, provider == 'University of Bristol')
print(nss_bristol)
## # A tibble: 1 × 10
##   ukprn  provider Scale01 Scale02 Scale03 Scale04 Scale05 Scale06 Scale07 exeter
##   <chr>  <chr>      <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl> <lgl> 
## 1 10007… Univers…   0.856   0.788   0.686    0.85   0.727   0.864   0.718 FALSE
# Same scale-wise Euclidean distance, this time Exeter vs Bristol.
scale_cols <- paste0('Scale0', 1:7)
sqrt(sum((nss_exeter[scale_cols] - nss_bristol[scale_cols])^2))
## [1] 0.09052624
# The pattern '*Bristol*' mixed glob wildcards into a regex; a plain
# fixed substring match is what was intended and returns the same rows.
nss_agree %>% filter(grepl('Bristol', provider, fixed = TRUE))
## # A tibble: 4 × 10
##   ukprn  provider Scale01 Scale02 Scale03 Scale04 Scale05 Scale06 Scale07 exeter
##   <chr>  <chr>      <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl> <lgl> 
## 1 10001… City of…   0.956   0.906   0.918   0.912   0.882   0.392   0.804 FALSE 
## 2 10023… Trinity…   1       1       0.982   1       1       1       1     FALSE 
## 3 10007… Univers…   0.856   0.788   0.686   0.85    0.727   0.864   0.718 FALSE 
## 4 10007… Univers…   0.836   0.813   0.758   0.832   0.725   0.873   0.714 FALSE

Compute all the distances simultaneously - need to do a bit of re-shaping. Requires numeric data only so need to create a numeric matrix.

# dist() needs numeric data, so keep only the scale columns as a
# matrix, with provider names as rownames for readable output later.
nss_m <- nss_agree %>% 
  select(starts_with('Scale')) %>% 
  as.matrix()
rownames(nss_m) <- nss_agree$provider
glimpse(nss_m)
##  num [1:411, 1:7] 0.785 0.765 0.788 0.872 0.886 0.955 0.683 0.958 0.827 0.958 ...
##  - attr(*, "dimnames")=List of 2
##   ..$ : chr [1:411] "AAP Education Limited" "ACM Guildford Limited" "AECC University College" "Abertay University" ...
##   ..$ : chr [1:7] "Scale01" "Scale02" "Scale03" "Scale04" ...

All pairs of Euclidean distances - check the size of the output

# All pairwise Euclidean distances between the 411 providers.
nss_dist <- dist(nss_m)

If we turn this into a matrix, and look at only the first three schools we see the distances. Along the diagonal the distance is 0, because that’s the distance of a school to itself.

# First three providers: the matrix is symmetric with zeros on the diagonal.
as.matrix(nss_dist)[1:3, 1:3]
##                         AAP Education Limited ACM Guildford Limited
## AAP Education Limited               0.0000000             0.2345698
## ACM Guildford Limited               0.2345698             0.0000000
## AECC University College             0.4601858             0.3528597
##                         AECC University College
## AAP Education Limited                 0.4601858
## ACM Guildford Limited                 0.3528597
## AECC University College               0.0000000

Quick summary stats of the distances.

# Five-number summary (plus mean) of all pairwise distances.
summary(nss_dist)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.01483 0.18326 0.27121 0.31168 0.39705 1.38850

Here’s a histogram of the distances. They tend to be between 0 and 0.5.

# Histogram of all pairwise distances; most fall between 0 and 0.5.
hist(as.numeric(nss_dist), breaks = 100, main = "Histogram of all distances")

TASK 1

Which University in the data is the most different from the University of Exeter? Which is the most similar?

as.matrix(nss_dist)[, "University of Exeter"] will produce a named vector of distances for all the schools. Can you work out the highest and lowest from the code below?

# Distances from every provider to the University of Exeter.
matrix_list <- as.matrix(nss_dist)[, "University of Exeter"]
# Blank out the zero self-distance so Exeter cannot be its own
# nearest neighbour (which.min/which.max skip NAs).
matrix_list[matrix_list == 0] <- NA
most_similar <- which.min(matrix_list)
most_different <- which.max(matrix_list)
# Report the answers by name — the originals were computed but never shown.
names(most_similar)
names(most_different)
# Cross-check: the six smallest distances, Exeter itself (0) first.
as.matrix(nss_dist)[, "University of Exeter"] %>% sort %>% head
##                                 University of Exeter 
##                                           0.00000000 
##                                 University of Sussex 
##                                           0.03196873 
## The London School of Economics and Political Science 
##                                           0.04395452 
##                        University of Nottingham, The 
##                                           0.05233546 
##                            University of Southampton 
##                                           0.05273519 
##                        Queen's University of Belfast 
##                                           0.06038212

Multidimensional Scaling

# Reduce the distance matrix to two MDS dimensions and tidy the result
# into a dataframe with the provider names as an explicit column.
nss_mds <- nss_dist %>% 
  cmdscale() %>% 
  as.data.frame() %>% 
  rownames_to_column(var = 'provider') %>% 
  rename(mds1 = V1, mds2 = V2)
ggplot(nss_mds, aes(x = mds1, y = mds2)) +
  geom_point()

Adding text is a total mess

# Labelling every one of 411 points at once is unreadable.
ggplot(nss_mds, aes(x = mds1, y = mds2, label = provider)) +
  geom_point() +
  geom_text()

We can use ‘filter’ to select points of interest for labeling

library(ggrepel)
# Label only the points with mds1 > 0.45; geom_text_repel nudges the
# labels apart so they stay legible.
interesting <- filter(nss_mds, mds1 > 0.45)
ggplot(nss_mds, aes(x = mds1, y = mds2)) +
  geom_point() +
  geom_text_repel(data = interesting, aes(label = provider))

Any outliers outside a unit circle? This will label only rows where the Euclidean distance from the origin is > 1.

# Label only providers lying outside the unit circle (Euclidean
# distance from the origin greater than 1).
outliers <- filter(nss_mds, sqrt(mds1^2 + mds2^2) > 1)
ggplot(nss_mds, aes(x = mds1, y = mds2)) +
  geom_point() +
  geom_text_repel(data = outliers, aes(label = provider))

Find University of Exeter

# Locate Exeter by labelling just that one row.
exeter_row <- filter(nss_mds, provider == 'University of Exeter')
ggplot(nss_mds, aes(x = mds1, y = mds2)) +
  geom_point() +
  geom_text_repel(data = exeter_row, aes(label = provider))

# Colour Exeter red against the other providers, label it, and hide
# the (redundant) two-level colour legend.
exeter_row <- filter(nss_mds, provider == 'University of Exeter')
ggplot(nss_mds, aes(x = mds1, y = mds2)) +
  geom_point(aes(color = ifelse(provider == 'University of Exeter',
                                'Exeter', 'Other'))) +
  scale_color_manual(values = c('Exeter' = 'red', 'Other' = 'black')) +
  geom_text_repel(data = exeter_row,
                  aes(label = provider),
                  nudge_x = 0.5, # push the label away from the point
                  nudge_y = 0.5) +
  theme(legend.position = "none")

Dimensionality of Data

Correlation Plot

library(corrplot)
## corrplot 0.95 loaded
# Correlation matrix of the seven NSS scales. select_if() is
# superseded in modern dplyr; select(where(...)) is the current idiom.
nss_cor <- nss_agree %>% 
  select(where(is.numeric)) %>% 
  cor()

What do you think about the correlated relationships? Which variable stands out from all the others? Can you think why? You may need to look at the original dataset to see what each of the themes are.

# Default circle-style correlation plot of the scale correlations.
corrplot(nss_cor)

If you really don’t like that the default label colour and style:

# Filled-colour cells with dark grey text labels instead.
corrplot(nss_cor, method = 'color', tl.col = 'grey20')

Pairs plot

Create a scatter plot of Scale01 against Scale02

# Scatter plot of the first two NSS scales.
nss_agree %>% 
  ggplot(aes(x = Scale01, y = Scale02)) +
  geom_point()

Create scatter plots for all pairs - you’ll use this for your final report

library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
# Pairwise scatter plots of all numeric scales. select_if() is
# superseded; select(where(...)) is the current dplyr idiom.
nss_agree %>% 
  select(where(is.numeric)) %>% 
  ggpairs()

Another way to look at the data: a 3D interactive plot.

Can you find Exeter?

library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout

https://plot.ly/r/3d-scatter-plots/

# Interactive 3D scatter of three scales; hover text names the
# provider, colour flags Exeter. (The brewer.pal warning is harmless:
# the palette is asked for 2 colours but its minimum is 3.)
plot_ly(nss_agree, x = ~Scale01, y = ~Scale02, z = ~Scale06, color = ~exeter, size = 2,
        text = ~paste('University:', provider)) %>% 
  add_markers()
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels

Principal Component Analysis

library(FactoMineR)
# PCA on the numeric scale matrix; PCA() also draws the default
# variable and individual factor maps as a side effect.
nss_pca <- PCA(nss_m)

library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

It looks like every single school can be described by a single number, they can all be ranked on a single dimension!

# Scree plot: variance explained drops sharply after dimension 1.
fviz_eig(nss_pca)

# Variable map: how each scale loads on the first two components.
fviz_pca_var(nss_pca)

Lollipop plot.

# Lollipop plot of each scale's contribution to each principal
# component. gather() is superseded; pivot_longer() is the current
# tidyr reshaping verb.
nss_pca$var$contrib %>% 
  as.data.frame() %>% 
  rownames_to_column(var = 'scale') %>% 
  pivot_longer(-scale, names_to = 'pca', values_to = 'value') %>% 
  ggplot() +
  geom_point(aes(y = scale, x = value)) +
  geom_segment(x = 0, aes(xend = value, y = scale, yend = scale)) +
  facet_wrap(~ pca)

PCA variable, individual plots, and biplots

# Individual (provider) map: labelled, then points only, then a
# biplot combining providers and scale loadings.
fviz_pca_ind(nss_pca)

fviz_pca_ind(nss_pca, geom = 'point')

fviz_pca_biplot(nss_pca, geom = 'point')

Out of interest, let’s look at Dimensions 2 and 3

# Same biplot but for dimensions 2 and 3.
fviz_pca_biplot(nss_pca, geom = 'point', axes = c(2,3))

Merge in the PCA results.

# Tidy the PCA individual coordinates into a dataframe and join them
# back onto the NSS data by provider name.
nss_pca_ind <- nss_pca$ind %>% 
  as.data.frame() %>% 
  rownames_to_column(var = 'provider')
nss <- nss_agree %>% 
  left_join(nss_pca_ind, by = 'provider')

Plot in 3D using first three dimensions

# 3D scatter of the first three PCA dimensions, Exeter highlighted by colour.
plot_ly(nss, x = ~coord.Dim.1, y = ~coord.Dim.2, z = ~coord.Dim.3, color = ~exeter, 
        size = 2, colors = c('#2C5B9B', '#F37267'),
        text = ~paste('University:', provider)) %>% 
  add_markers()

Clustering

Hierarchical clustering

# Average-linkage hierarchical clustering of the city distances.
# 'average' is spelled out: the original 'ave' only worked through
# partial matching of the method argument.
cities_hc <- hclust(as.dist(select(cities, -city)), method = 'average')
plot(cities_hc)

Play around with the miles_apart value. What is the best place to cut the tree to give you a good clustering?

library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
miles_apart <- 300
# Cut the dendrogram at 'miles_apart' to assign each city a cluster id.
md_cities$cluster <- cutree(cities_hc, h = miles_apart)
# MDS map coloured by cluster membership (axes reversed as before).
p1 <- ggplot(md_cities, aes(x = V1, y = V2, color = as.factor(cluster))) +
  scale_x_reverse('MDS 1') +
  scale_y_reverse('MDS 2') +
  geom_text(aes(label = city_name)) +
  theme_bw()
# Dendrogram with the cut height marked by a dashed line.
p2 <- fviz_dend(cities_hc) +
  geom_hline(yintercept = miles_apart, linetype = 2)
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## ℹ The deprecated feature was likely used in the factoextra package.
##   Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Show the map and the dendrogram side by side.
grid.arrange(p1, p2, nrow = 1)

What happens when we do use hierarchical clustering on the most recent NSS data?

# Hierarchical clustering of the providers (hclust's default linkage),
# with labels shortened to at most 10 characters so the plot is legible.
nss_hc <- hclust(nss_dist)
nss_hc$labels <- nss_hc$labels %>% 
  abbreviate(minlength = 10) %>% 
  as.character()
plot(nss_hc)

# Horizontal dendrogram with smaller text so the 411 labels fit better.
# TRUE is spelled out: T is an ordinary variable and can be reassigned.
fviz_dend(nss_hc, rotate = TRUE) +
  theme(text = element_text(size = 10))

# Numeric-only data frame of the scale columns, with provider names
# as rownames for the clustering helpers below.
nss_d <- nss_agree %>% 
  select(starts_with('Scale')) %>% 
  as.data.frame()
rownames(nss_d) <- nss_agree$provider

Task:

Try varying the number of clusters by changing k to different values. NOTE: These are randomly generated clusters

How does it impact the WSS and Silhouette recommendations for the number of clusters? How many clusters would you choose compared to how many it recommends?

# Set the seed so the simulated clusters are reproducible.
set.seed(666)

# Parameters
k <- 5   # number of clusters to generate
N <- 500 # points per cluster

# Random cluster centres in the [0, 10] square.
xs <- runif(k, 0, 10)
ys <- runif(k, 0, 10)

# N points around each centre (sd = 0.5). The vectorised rnorm() call
# draws its deviates in the same order as the original lapply/unlist
# version, so under the same seed the data are identical.
d <- data.frame(
  xs = rnorm(N * k, mean = rep(xs, each = N), sd = 0.5),
  ys = rnorm(N * k, mean = rep(ys, each = N), sd = 0.5),
  cluster = factor(rep(1:k, each = N)) # true cluster label
)

# Plot the generated data, coloured by true cluster.
ggplot(d, aes(x = xs, y = ys, color = cluster)) +
  geom_point(alpha = 0.5) +
  theme_minimal() +
  scale_color_discrete(name = "Cluster")

# How many clusters do the elbow (WSS) and silhouette diagnostics
# recommend for the simulated data?
fviz_nbclust(d, kmeans, method = 'wss')

fviz_nbclust(d, kmeans, method = 'silhouette')

Now let’s do k-means clustering on our generated dataset. Run the cell multiple times - what do you see?

The initial placement of centroids affects the final clustering result.

# k-means on the numeric coordinates only; centroid initialisation is
# random, so repeated runs can give different partitions.
xy <- d[, c("xs", "ys")]
kmeans_result <- kmeans(xy, centers = k)

# Visualise the fitted clusters with factoextra.
fviz_cluster(kmeans_result, data = xy, geom = "point")

Clustering NSS

# Diagnostics for choosing k on the NSS data: within-cluster sum of
# squares (elbow) and average silhouette, for both hierarchical
# clustering (hcut) and k-means.
fviz_nbclust(nss_d, hcut, method = 'wss')

fviz_nbclust(nss_d, kmeans, method = 'wss')

fviz_nbclust(nss_d, hcut, method = 'silhouette')

fviz_nbclust(nss_d, kmeans, method = 'silhouette')

Task

Play with the number of clusters. What happens at N = 7?

The overlap in your plot when visualising k-means is due to the underlying data distribution. Because k-means works by minimising the sum of squared distances between data points and their assigned cluster centroids, it assumes clusters are roughly spherical, equally sized, and non-overlapping. The dimensions appear to overlap when visualised in 2D.

Hierarchical clustering can capture more complex cluster shapes.

N <- 3  # number of clusters to fit

# k-means with 20 random starts to stabilise the solution.
nss_km <- kmeans(nss_d, centers = N, nstart = 20)
km_plot <- fviz_cluster(nss_km, nss_d, geom = 'point') +
  theme_minimal() +
  labs(title = paste('K-Means Clustering (k =', N, ')'))

# Hierarchical clustering cut into the same number of groups.
nss_hcut <- hcut(nss_d, k = N)
hc_plot <- fviz_cluster(nss_hcut, nss_d, geom = 'point') +
  theme_minimal() +
  labs(title = paste('Hierarchical Clustering (k =', N, ')'))

# Stack the two plots for side-by-side comparison.
grid.arrange(km_plot, hc_plot)

Instead of a scatterplot showing the clusters in 2D space, we are using a density plot that shows distribution of data points along a single dimension. Each cluster is represented as a single density curve. You can see how much overlap exists between clusters in dimension 1.

# Refit k-means. NOTE(review): no set.seed() here, so cluster ids can
# permute between runs even with nstart = 20.
nss_km <- kmeans(nss_d, centers = N, nstart = 20)
# Attach each provider's cluster as a factor for plotting.
nss$km_cluster <- factor(nss_km$cluster)
# Density of PCA dimension 1 within each cluster shows their overlap.
ggplot(nss, aes(x = coord.Dim.1, fill = km_cluster)) +
  geom_density(color = 'white', alpha = 0.5) +
  theme_minimal()

# Mean scale scores of each k-means centroid.
nss_km$centers
##     Scale01   Scale02   Scale03   Scale04   Scale05   Scale06   Scale07
## 1 0.7902308 0.7520577 0.7078462 0.7730962 0.4778077 0.6826154 0.6057308
## 2 0.8644980 0.8353306 0.8038327 0.8636939 0.7274939 0.8360000 0.7418122
## 3 0.9481930 0.9303421 0.9196930 0.9448246 0.8805965 0.8764737 0.8780526
# 3D views of the clusters: first in raw scale space, then in PCA space.
plot_ly(nss, x = ~Scale01, y = ~Scale02, z = ~Scale06, color = ~km_cluster, size = 2,
        text = ~paste('University:', provider)) %>% 
  add_markers()
plot_ly(nss, x = ~coord.Dim.1, y = ~coord.Dim.2, z = ~coord.Dim.3, color = ~km_cluster, size = 2,
        text = ~paste('University:', provider)) %>% 
  add_markers()

Which cluster is Exeter in?

# Exeter's k-means cluster assignment.
nss %>% filter(provider == "University of Exeter") %>% pull(km_cluster)
## University of Exeter 
##                    2 
## Levels: 1 2 3
# Exeter's position on PCA dimension 1.
nss %>% filter(provider == "University of Exeter") %>% pull(coord.Dim.1)
## [1] -0.776422

We can filter parts of the dataset if we want to look in more detail

# Providers far out on PCA dimension 1, kept for closer inspection.
high_dim <- nss %>% 
  filter(coord.Dim.1 < -4)

Let’s filter to visualise some of the universities close to Exeter using just PCA dimension 1

# Window around the University of Exeter's coord.Dim.1 value.
# Named 'window' rather than 'range' so we don't mask base::range().
exeter_coord <- -0.77
window <- 0.2  # widen or narrow as needed

# Providers whose first PCA coordinate falls strictly inside the window.
near_exeter <- nss %>%
  filter(coord.Dim.1 > (exeter_coord - window),
         coord.Dim.1 < (exeter_coord + window))

# Plot the nearby universities with repelled text labels.
ggplot(near_exeter, aes(x = coord.Dim.1, y = coord.Dim.2, label = provider)) +
  geom_point(color = "blue", size = 3) +
  geom_text_repel() +
  theme_minimal() +
  labs(
    x = "Coordinate Dimension 1",
    y = "Coordinate Dimension 2"
  )
## Warning: ggrepel: 3 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps